import pandas as pd
import matplotlib.pyplot as plt
import gmaps
# NOTE(review): the original line was a bare `low_memory=False` assignment, which
# only creates an unused module-level variable and has no effect on pandas.
# To take effect it must be passed as a keyword argument, e.g.
# pd.read_csv(path, low_memory=False) in the read_csv calls below.
Given our datasets, we want to describe them in great detail to understand the patterns of weather and biking.
Analyzing them standalone will give us insights on how the data are structured and whether there have been any outliers due to events in the area. For instance, an unusually high number of bike rides might be observed due to a concert or a public event in that area. Contrastingly, the recent California wildfires would have reduced the number of bikers.
Furthermore, the ability to predict the number of hourly users can allow the businesses or government agencies that oversee these systems to manage them in a more efficient and cost-effective manner.
In the next phase of this project, our goal is to use weather information to effectively predict the number of ride-sharing bikes that will be used in any given period, using available information about that time/day. We can generate graphs of trips that take place between the most popular stations. Also we could generate a general heat map of the stations signifying demand and supply throughout the Bay Area.
# All raw data files live in the same folder; keep the path in one place
# instead of repeating it in every read_csv call.
DATA_DIR = "C:/Users/pujar/Dropbox/Semester 2/Python-BUDT758X/Pawject/data/"

# Summary statistics of the bike-share data (displayed for reference).
desc = pd.read_csv(DATA_DIR + "desc_bike.csv", encoding='utf-8')
desc
# Summary statistics of the weather data (note: this rebinds `desc`).
desc = pd.read_csv(DATA_DIR + "desc_weather.csv", encoding='utf-8')
desc
# SF daily weather observations for 2018.
wdf = pd.read_csv(DATA_DIR + "SF_Weather_2018.csv", encoding='utf-8')
# Full year of bike-share trips (concatenated monthly exports).
bdf = pd.read_csv(DATA_DIR + 'all.csv', encoding='utf-8', sep=',')
# Split the start/end timestamps into separate date and time-of-day columns.
# Placeholder columns are inserted at fixed positions so the new fields sit
# next to the timestamps they were derived from.
bdf.insert(2, 'start_date', 0)
bdf.insert(3, 'end_date', 0)
bdf.insert(4, 'start_timing', 0)
bdf.insert(5, 'end_timing', 0)
# The monthly CSV exports were concatenated with their header lines left in,
# so every ~100,000 rows a literal 'start_time' header row appears. Drop those
# first so the regex extraction below only sees real timestamps.
bdf = bdf[bdf['start_time'] != 'start_time']
# Cast to string so the .str regex accessors work uniformly.
bdf['start_time'] = bdf['start_time'].astype(str)
bdf['end_time'] = bdf['end_time'].astype(str)
# Raw strings avoid invalid-escape warnings on \d. The date part is
# YYYY-MM-DD; the time part is HH:MM:SS plus a 4-digit fraction (the '.'
# matches the fractional-second separator, as in the original pattern).
bdf['start_date'] = bdf['start_time'].str.extract(r'(\d{4}-\d{2}-\d{2})')
bdf['start_timing'] = bdf['start_time'].str.extract(r'(\d{2}:\d{2}:\d{2}.\d{4})')
bdf['end_date'] = bdf['end_time'].str.extract(r'(\d{4}-\d{2}-\d{2})')
bdf['end_timing'] = bdf['end_time'].str.extract(r'(\d{2}:\d{2}:\d{2}.\d{4})')
# Drop the snow columns (unused anywhere downstream) BEFORE merging --
# in the original the deletion happened after the merge, which only removed
# them from wdf and left the two dead columns inside df.
del wdf['New Snow']
del wdf['Snow Depth']
# Normalize the weather date to the same 'YYYY-MM-DD' string format as
# bdf['start_date'] so the merge key matches.
wdf['Date'] = pd.to_datetime(wdf['Date']).dt.strftime('%Y-%m-%d')
# Inner-join each ride with that day's weather.
df = pd.merge(bdf, wdf, left_on='start_date', right_on='Date')
wdf.head()
Source: https://www.popsci.com/fires-california-air-quality-cigarettes
from itertools import cycle, islice
# One color per row, cycled from a single hue (the bar plots below index into it).
mycolors = list(islice(cycle(['#6C5B7B']), None, len(df)))
# Extract the month ('-MM') from the YYYY-MM-DD start date, then strip the
# leading dash. Raw string avoids an invalid-escape warning on \d.
ser = df['start_date'].str.extract(r"(-\d{1,2})")
ser[0] = ser[0].str.replace('-','')
# Count rides per month.
ser = ser[0].groupby(ser[0]).count()
# Plot
plt.figure(figsize=(19,7), facecolor='white')
plt.title("Popular biking months")
plt.ylabel('Frequency of rides')
plt.xlabel('Month')
ser.plot.bar(x='lab', y='val', rot=0, color=mycolors)
# Shade bars ~9-11 -- presumably highlighting the late-2018 California
# wildfire period that depressed ridership. TODO(review): confirm the window.
plt.axvspan(8.5, 10.5, color='black', alpha=0.5)
plt.show()
# Count rides per calendar date. Assigning .columns directly (rather than
# rename(columns={'Date': ...})) works on both pandas<2, where the
# value_counts() frame column is named 'Date', and pandas>=2, where it is
# named 'count' and the original rename silently did nothing.
df1 = df.Date.value_counts()
df1 = df1.to_frame()
df1.columns = ['Number of Bike rides']
df1.index.names = ['Date']
# Sort chronologically (value_counts returns descending by frequency).
df1 = df1.sort_index()
from datetime import datetime
import matplotlib.dates as mdate
import matplotlib as mpl
import datetime as dt
import matplotlib.pyplot as py
# Top panel of a two-panel figure: daily ride counts across 2018.
fig = plt.figure(figsize=(18,12),facecolor='white')
ax1 = fig.add_subplot(2,1,1)  # first plot on the top half
# Convert the index to datetime so matplotlib can place and format the x axis.
df1.index = pd.to_datetime(df1.index, format='%Y-%m-%d')
plt.title('Daily usage of bike rides - 2018, SF',fontsize=16)
plt.xlabel('Date',fontsize=12)
# Format x-axis tick labels as year-month. (The original set a '%b %Y'
# formatter first and immediately overwrote it; only the '%Y-%m' one is kept.)
dateFmt = mpl.dates.DateFormatter('%Y-%m')
ax1.xaxis.set_major_formatter(dateFmt)
# Ticks on the first of each month across 2018.
plt.xticks(pd.date_range('2018-01','2018-12',freq='MS'),rotation=30)
plt.ylabel("Number of bike rides",fontsize=12)
plt.ylim([0,8500])
plt.plot(df1.index,df1['Number of Bike rides'],label='Number of bike rides',c='blue',alpha=0.3)
# Bottom panel: daily temperature range, drawn into the SAME figure created
# above. The original called plt.figure() again here, which opened a second
# half-empty figure instead of placing this panel under the rides plot.
ax = fig.add_subplot(2,1,2)
wdf['Date'] = pd.to_datetime(wdf['Date'], format='%Y-%m-%d')
plt.plot(wdf['Date'],wdf['Maximum'],label='Maximum temperature',c='#F8B195',alpha=0.5)
plt.plot(wdf['Date'],wdf['Minimum'],label='Minimum temperature',c='#C06C84',alpha=0.5)
plt.title('Daily high and low temperatures - 2018, SF',fontsize=16)
plt.xlabel('Date',fontsize=16)
# Month locator/formatter for this axis.
plt.gca().xaxis.set_major_formatter(mdate.DateFormatter('%Y-%m-%d'))
plt.gca().xaxis.set_major_locator(mdate.MonthLocator())
# Range of x axis is 2018-01-01 to 2018-12-31, one tick per month start.
plt.xticks(pd.date_range('2018-01-01','2018-12-31',freq="MS"),rotation=30)
plt.ylabel("Temperature ( F )",fontsize=12)
plt.ylim([20,100])
plt.tick_params(axis='both',which='major',labelsize=12)
plt.legend()
plt.show()
# Count rides per hour of day: the leading 1-2 digits of start_timing are the
# hour. Raw string avoids an invalid-escape warning on \d.
ser = df['start_timing'].astype('str')
ser = ser.str.extract(r"(\d{1,2})")
ser = ser[0].groupby(ser[0]).count()
# Plot
plt.figure(figsize=(19,7), facecolor='white')
plt.title("Popular biking hours")
plt.ylabel('Frequency of rides')
plt.xlabel('Hour of the day')
ser.plot.bar(x='lab', y='val', rot=0, color=mycolors)
plt.show()
# Ten most popular start stations.
print(df['start_station_name'].value_counts().sort_values(ascending=False).head(10))
import gmaps
import gmaps.datasets
import os
# SECURITY: never commit a literal API key to source control -- read it from
# the environment instead. (The original embedded the key inline; rotate that
# key if this file was ever shared or pushed.)
key = os.environ.get("GMAPS_API_KEY", "")
gmaps.configure(api_key=key)  # Set GMAPS_API_KEY with your own API key
# Sample ride-start coordinates; cap at the dataframe size so .sample no
# longer raises when there are fewer than 400,000 rows.
start_locations = df[['start_station_latitude', 'start_station_longitude']]
start_df = start_locations.sample(min(400000, len(start_locations)))
# Count sampled rides per unique station coordinate.
start_df1 = start_df.groupby(by=['start_station_latitude','start_station_longitude'])
start_df2 = pd.DataFrame(start_df1.size())
start_df2.rename(columns={0: 'Numbers'}, inplace=True)
start_df2 = start_df2.reset_index()
# gmaps expects float weights and coordinates.
start_df2['Numbers'] = start_df2['Numbers'].astype('float')
start_df2['start_station_latitude'] = start_df2['start_station_latitude'].astype('float')
start_df2['start_station_longitude'] = start_df2['start_station_longitude'].astype('float')
# Heat map of ride-start demand across the Bay Area.
start_fig = gmaps.figure()
heatmap_layer = gmaps.heatmap_layer(
    start_df2[['start_station_latitude','start_station_longitude']], weights=start_df2['Numbers'],
    max_intensity=100, point_radius=5.0
)
start_fig.add_layer(heatmap_layer)
start_fig
# Ten most popular end stations.
print(df['end_station_name'].value_counts().sort_values(ascending=False).head(10))
# Sample ride-end coordinates; cap at the dataframe size so .sample no
# longer raises when there are fewer than 400,000 rows.
end_locations = df[['end_station_latitude','end_station_longitude']]
end_df = end_locations.sample(min(400000, len(end_locations)))
# Count sampled rides per unique station coordinate.
end_df1 = end_df.groupby(by=['end_station_latitude','end_station_longitude'])
end_df2 = pd.DataFrame(end_df1.size())
end_df2.rename(columns={0: 'Numbers'}, inplace=True)
end_df2 = end_df2.reset_index()
# gmaps expects float weights and coordinates.
end_df2['Numbers'] = end_df2['Numbers'].astype('float')
end_df2['end_station_latitude'] = end_df2['end_station_latitude'].astype('float')
end_df2['end_station_longitude'] = end_df2['end_station_longitude'].astype('float')
# Heat map of ride-end demand across the Bay Area.
end_fig = gmaps.figure()
heatmap_layer = gmaps.heatmap_layer(
    end_df2[['end_station_latitude','end_station_longitude']], weights=end_df2['Numbers'],
    max_intensity=100, point_radius=5.0
)
end_fig.add_layer(heatmap_layer)
end_fig
# Repeated-header rows left literal 'member_gender' strings in this column;
# relabel them as 'Unknown' before counting.
df['member_gender'] = df['member_gender'].str.replace('member_gender','Unknown')
# Frequency of each gender category, most common first.
gender_counts = df['member_gender'].dropna().value_counts().sort_values(ascending=False)
df_gender = gender_counts
# Bar chart of riders per gender category.
plt.figure(figsize=(19,7), facecolor='white')
plt.title("Gender Overview")
plt.ylabel('Number')
plt.xlabel('Gender')
mycolors = list(islice(cycle(['#F8B195', '#F67280', '#C06C84']), None, len(df)))
ax = df_gender.plot.bar(x='lab', y='val', rot=0, color=mycolors)
plt.show()
import numpy as np
import datetime
# Rider ages: birth years become ages relative to the current year; leftover
# repeated-header rows and NaNs are discarded first.
now = datetime.datetime.now()
birth_years = df[df['member_birth_year'] != 'member_birth_year']['member_birth_year'].dropna()
df_clean_age = birth_years
ser_age = float(now.year) - df_clean_age.astype('float64')
ser_age = ser_age.astype('float64')
# Bucket ages into 12 equal-width bins spanning the observed range.
bins = np.linspace(start=ser_age.min(), stop=ser_age.max(), num=13)
ser_age = pd.cut(ser_age, bins)
ser_age = ser_age.value_counts().sort_index(ascending=True)
# Histogram-style bar chart of rider age frequencies.
plt.figure(figsize=(19,7), facecolor='white')
plt.title("Frequency of Age")
plt.ylabel('Number of People')
plt.xlabel('Age')
ser_age.plot.bar(x='lab', y='val', rot=0, color=mycolors)
plt.show()
# Cast precipitation and ride duration to float for numeric plotting.
df['Precipitation'] = df['Precipitation'].astype('float64')
df['duration_sec'] = df['duration_sec'].astype('float64')
# Rides that happened on days with measurable rain.
df_rainy = df[df['Precipitation']>0]
# Rain vs duration of individual bike rides.
plt.figure(figsize=(15,5), facecolor='white')
plt.plot(df_rainy['Precipitation'],df_rainy['duration_sec'],'g.')
plt.xlabel('Precipitation')
plt.ylabel('Duration')  # fixed axis-label typo ('Durantion')
plt.show()
# Rain vs NUMBER of bike rides: count rides at each precipitation level.
num_ride_rainy = df_rainy['Precipitation'].value_counts()
num_ride_rainy
df2 = num_ride_rainy.to_frame()
# value_counts().to_frame() names its column 'Precipitation' on pandas<2 but
# 'count' on pandas>=2 -- the original computed `column` and then ignored it,
# hard-coding 'Precipitation'. Use the actual column name instead.
column = df2.columns[0]
df2['index'] = df2.index.tolist()
plt.figure(figsize=(15,5), facecolor='white')
plt.plot(df2['index'],df2[column],'g.')
plt.xlabel('Precipitation')
plt.ylabel('Numbers of bike rides')
plt.show()
# For each maximum temperature: count rides per (Maximum, start_date) pair,
# then average those per-day counts, giving the mean daily usage observed at
# that maximum temperature.
df3 = df[['start_date','Maximum']].groupby(["Maximum", "start_date"]).size().reset_index(name="usage").groupby(by='Maximum')['usage'].agg('mean').sort_values(ascending=False)
df3.sort_index().head()
# Maximum temperature vs average daily usage (title/axis typos 'teperature' fixed).
fig = plt.figure(figsize=(10,6), facecolor='white')
plt.plot(df3,'y.',label='Average numbers of bike rides')
plt.title('Maximum temperature VS Average daily usage of bike ride')
plt.xlabel('Maximum temperature',fontsize=12)
plt.ylabel('Average numbers of bike rides',fontsize=12)
plt.legend()
plt.show()
# Three crosstab bar charts: bike_share_for_all_trip by gender, by user type,
# and user type by gender. Each is drawn on the lower half of its own figure,
# matching the layout used for the panels above.
crosstab_specs = [
    (df.member_gender, df.bike_share_for_all_trip, ['Male','Female','Other']),
    (df.user_type, df.bike_share_for_all_trip, ['Subscriber','Customer']),
    (df.member_gender, df.user_type, ['Male','Female','Other']),
]
for row_key, col_key, row_order in crosstab_specs:
    fig = plt.figure(figsize=(18,12), facecolor='white')
    ax = fig.add_subplot(2,1,2)
    pd.crosstab(index=row_key, columns=col_key).loc[row_order].plot(kind='bar', ax=ax)
    plt.xlabel('')
def draw_pred_plot(y, y_true=None):
    """Plot predicted values against the real (test) targets.

    Parameters
    ----------
    y : array-like
        Predicted values.
    y_true : array-like, optional
        Ground-truth values. Defaults to the module-level ``y_test``,
        preserving the original behavior for existing one-argument calls
        (the original read the global directly, a hidden dependency).
    """
    if y_true is None:
        y_true = y_test  # fall back to the global test targets
    fig = plt.figure(figsize=(12,8), facecolor='white')
    plt.plot(y_true, color = 'red', label = 'Real data')
    plt.plot(y, color = 'blue', label = 'Predicted data')
    plt.title('Prediction')
    plt.legend()
    plt.show()
# Daily weather features: average each weather column per start_date.
df0 = df[['Average', 'Departure', 'HDD', 'CDD','Precipitation','start_date']]
df0 = df0.groupby('start_date').mean()
# Daily ride count (rows per start_date) becomes the regression target.
# Bound to a distinct name so the earlier df1 (daily-rides frame) is not
# silently clobbered; df1 is still assigned for backward compatibility.
daily_counts = df[['Average', 'Departure', 'HDD', 'CDD','Precipitation','start_date']].groupby('start_date').count()['Average']
df1 = daily_counts
df0['count'] = daily_counts
df0.head()
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
# Features (daily weather aggregates) and target (daily ride count).
dfr = df0[['Average', 'Departure', 'HDD', 'CDD','Precipitation']]
# Fixed random_state makes the 80/20 split -- and every metric printed in the
# cells below -- reproducible across notebook runs (the original split
# differed on every execution).
X_train, X_test, y_train, y_test = train_test_split(
    dfr, df0['count'].values, test_size=0.2, random_state=42)
lr = LinearRegression(fit_intercept=True)
lr.fit(X_train, y_train)
print('The out-of-sample R2 is', lr.score(X_test, y_test))
# Fitted coefficient per weather feature.
pd.DataFrame({'Feature': dfr.columns, 'Coefficient': lr.coef_}, columns=['Feature','Coefficient'])
HDD and CDD are heating and cooling degree days; see https://www.eia.gov/energyexplained/index.php?page=about_degree_days for the definitions.
# Evaluate the linear model on the held-out data.
y = lr.predict(X_test)
draw_pred_plot(y)
r2lr = lr.score(X_test, y_test)
# sklearn's signature is mean_squared_error(y_true, y_pred); MSE is symmetric
# so the value is unchanged, but the order now matches the documented contract.
mselr = mean_squared_error(y_test, y)
print("R^2: ", r2lr)
print("MSE Linear Regression: ", mselr)
from sklearn import tree
# Fit an un-tuned regression tree as a baseline.
# (No random_state is set, so the baseline varies slightly between runs.)
tr = tree.DecisionTreeRegressor()
tr.fit(X_train, y_train)
r2tr = tr.score(X_test, y_test)
print('The out-of-sample R2 is', r2tr)
# Grid-search min_samples_split and max_depth over 3..99 (97x97 combinations,
# 5-fold CV, scored by R^2, 4 parallel jobs) -- this is an expensive cell.
min_sample_split_params = list(range(3,100))
max_depth_params = list(range(3,100))
params = {'min_samples_split': min_sample_split_params, 'max_depth': max_depth_params}
tree_grid = GridSearchCV(estimator=tr, param_grid=params, scoring='r2', cv=5, n_jobs=4)
tree_grid.fit(X_train, y_train)
# Assign best estimator to variable and evaluate performance.
# NOTE(review): this rebinding shadows the `tree` MODULE imported above; later
# cells (export_graphviz and the AdaBoost base_estimator) depend on `tree`
# being the fitted estimator, so the name must be kept as-is.
tree = tree_grid.best_estimator_
y = tree.predict(X_test)
draw_pred_plot(y)
r2tr = tree.score(X_test, y_test)
msetr = mean_squared_error(y, y_test)
print("Tree R^2: ",r2tr)
print("MSE Tree: ", msetr)
# sklearn.externals.six was removed in scikit-learn 0.23; io.StringIO is the
# drop-in replacement for collecting the Graphviz dot source.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
# Render the tuned tree to PNG: write it to disk and display it inline.
dot_data = StringIO()
export_graphviz(tree, out_file=dot_data, filled=True, rounded=True,special_characters=True, feature_names = X_train.columns)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('Biking.png')
Image(graph.create_png())
from sklearn import ensemble as en
from sklearn import metrics
# Boost the grid-searched tree with AdaBoost.
# NOTE: base_estimator was renamed `estimator` in scikit-learn 1.2 and removed
# in 1.4 -- keep or rename per the installed version.
dt5 = en.AdaBoostRegressor(random_state=1, base_estimator=tree)
regr = dt5.fit(X_train, y_train)
# BUG FIX: the original predicted with `tr` (the un-tuned baseline tree), so
# both the plot and the printed "MSE Boosting" described the baseline, and
# `y = regr.predict(...)` only ran AFTER the metric was computed. Predict with
# the boosted model up front so plot and MSE describe the boosted model.
y = regr.predict(X_test)
draw_pred_plot(y)
r2b = regr.score(X_test, y_test)
mseb = mean_squared_error(y_test, y)
print("Boosting R^2: ", r2b)
print("MSE Boosting:", mseb)
# Interactive bar chart of the boosted model's feature importances.
# The deprecated `plotly.plotly` module (moved to the separate chart_studio
# package, importing it raises on modern plotly) and the unused wildcard
# graph_objs import have been dropped; only `go` is actually used below.
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go
init_notebook_mode()
data = [go.Bar(
    x=X_test.columns.tolist(),
    y=regr.feature_importances_.tolist()
)]
iplot(data)
# Compare out-of-sample R^2 across the three models.
r2json = {"Linear Regression":r2lr, "Decision Trees":r2tr, "Boosting":r2b}
fig = plt.figure(figsize=(12,8), facecolor='white')
method_names, r2_scores = zip(*r2json.items())
plt.bar(method_names, r2_scores)
plt.xlabel('----Method----')
plt.ylabel('R^2')
plt.show()
# Compare test-set MSE across the three models.
msejson = {"Linear Regression":mselr, "Decision Trees":msetr, "Boosting":mseb}
fig = plt.figure(figsize=(12,8), facecolor='white')
method_names, mse_values = zip(*msejson.items())
plt.bar(method_names, mse_values)
plt.xlabel('----Method----')
plt.ylabel('MSE')
plt.show()